This is an initial test report that analyses the causal relationship between variables of development projects (GitHub) and quality characteristics of the software (SonarCloud). The data was obtained through the public APIs of both platforms, and the JSON data was filtered and pre-processed using an intermediate MongoDB document database.
The result of the preprocessing has been stored in a CSV file. The first step is to import the data from that file.
library(readr)
# Load the pre-processed GitHub/SonarCloud export; fields are separated by ";".
# NOTE(review): the import reports rows with more columns than expected
# (123 or 117 instead of 111), so the CSV layout should be checked.
sonar_git <- read_delim(
  file = "../data/sonar-git.csv",
  delim = ";",
  quote = "\\\"",
  escape_double = FALSE,
  locale = locale(),
  trim_ws = TRUE
)
## Parsed with column specification:
## cols(
## .default = col_double(),
## project = col_character(),
## version = col_character(),
## from = col_datetime(format = ""),
## to = col_datetime(format = ""),
## file_complexity_distribution = col_character(),
## files = col_character(),
## function_complexity_distribution = col_character(),
## functions = col_character(),
## generated_lines = col_character(),
## generated_ncloc = col_character(),
## info_violations = col_character(),
## line_coverage = col_character(),
## new_line_coverage = col_character(),
## lines = col_character(),
## ncloc = col_character(),
## lines_to_cover = col_character(),
## new_lines_to_cover = col_character(),
## sqale_rating = col_character(),
## alert_status = col_character(),
## security_hotspots = col_character()
## # ... with 1 more columns
## )
## See spec(...) for full column specifications.
## Warning: 113 parsing failures.
## row col expected actual file
## 6 -- 111 columns 123 columns '../data/sonar-git.csv'
## 7 -- 111 columns 123 columns '../data/sonar-git.csv'
## 8 -- 111 columns 123 columns '../data/sonar-git.csv'
## 9 -- 111 columns 117 columns '../data/sonar-git.csv'
## 10 -- 111 columns 117 columns '../data/sonar-git.csv'
## ... ... ........... ........... .......................
## See problems(...) for more details.
Then, we filter the matrix (with 107 variables) to keep only the variables of interest (selected after some preliminary analyses). With the filtered data, we show the descriptive statistics.
library(dplyr)
## Warning: package 'dplyr' was built under R version 3.6.3
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Keep the two identifier columns plus the activity and quality measures
# that the rest of the report works with.
dataset1 <- sonar_git %>%
  select(
    project, version,
    commits, changes_by_commit, committers, committers_weight,
    bugs, code_smells, complexity, violations,
    duplicated_blocks, open_issues
  )

# Descriptive statistics of the selected columns.
summary(dataset1)
## project version commits changes_by_commit
## Length:224 Length:224 Min. : 0.00 Min. : 0.00
## Class :character Class :character 1st Qu.: 6.00 1st Qu.: 43.18
## Mode :character Mode :character Median : 29.00 Median : 201.02
## Mean : 76.59 Mean : 954.96
## 3rd Qu.: 73.50 3rd Qu.: 458.93
## Max. :740.00 Max. :34902.00
## committers committers_weight bugs code_smells
## Min. : 0.000 Min. :0.00000 Min. : 0.00 Min. : 0
## 1st Qu.: 1.000 1st Qu.:0.01688 1st Qu.: 0.00 1st Qu.: 109
## Median : 4.000 Median :0.06865 Median : 1.00 Median : 151
## Mean : 5.161 Mean :0.22962 Mean : 41.88 Mean : 1907
## 3rd Qu.: 6.000 3rd Qu.:0.23140 3rd Qu.: 31.00 3rd Qu.: 1081
## Max. :32.000 Max. :1.00000 Max. :923.00 Max. :40618
## complexity violations duplicated_blocks open_issues
## Min. : 0 Min. : 0.0 Min. : 0.00 Min. : 0
## 1st Qu.: 2420 1st Qu.: 185.0 1st Qu.: 18.75 1st Qu.: 2
## Median : 5770 Median : 590.5 Median : 52.00 Median : 133
## Mean : 13295 Mean : 2058.6 Mean : 130.95 Mean : 1740
## 3rd Qu.: 12033 3rd Qu.: 892.0 3rd Qu.: 86.00 3rd Qu.: 630
## Max. :143551 Max. :42591.0 Max. :1931.00 Max. :42590
First, we analyse the relationship between commits and committers.
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 3.6.3
library(plotly)
## Warning: package 'plotly' was built under R version 3.6.3
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
# Box plot of the number of commits (x) for each project (y).
p <- ggplot(data = dataset1, mapping = aes(x = commits, y = project))
p <- p + geom_boxplot(fill = "gray")
p

# Same view for the number of committers.
p <- ggplot(data = dataset1, mapping = aes(x = committers, y = project))
p <- p + geom_boxplot(fill = "gray")
p
# Commits against committers, one point per row coloured by project, with a
# filled 2D density overlay (green = low level, red = high level).
sp <- ggplot(dataset1, aes(x = commits, y = committers)) +
  geom_point(aes(colour = project)) +
  stat_density_2d(
    aes(fill = ..level..),
    geom = "polygon",
    alpha = 0.2
  ) +
  scale_fill_gradient(low = "green", high = "red")
sp + theme_classic()

# Same plot restricted to the window commits <= 120, committers <= 10.
zoom_sp <- sp + coord_cartesian(xlim = c(0, 120), ylim = c(0, 10))
zoom_sp + theme_classic()
# Two-dimensional kernel density estimate of (committers, commits) evaluated
# on a 50 x 50 grid: kd$x holds the committers grid, kd$y the commits grid.
kd <- with(dataset1, MASS::kde2d(committers, commits, n = 50))

# kde2d() returns z with rows indexed by x and columns indexed by y, whereas a
# plotly surface reads the rows of z along y and the columns along x. The
# matrix is therefore transposed so that the surface lines up with its axes
# (with a square grid the untransposed call runs without error but is
# mirrored across the diagonal).
fig <- plot_ly(x = kd$x, y = kd$y, z = t(kd$z)) %>% add_surface()
fig
As preliminary analysis, we compute correlation values and draw a matrix of scatter plots:
# Drop the first two columns (project, version) so only numeric measures remain.
dataset_only_data <- dataset1 %>% select(-1, -2)

# Pairwise correlation matrix of the measures.
M <- cor(x = dataset_only_data)

# Scatter-plot matrix of every pair of measures.
plot(dataset_only_data)
library(corrgram)
## Warning: package 'corrgram' was built under R version 3.6.3
## Registered S3 method overwritten by 'seriation':
## method from
## reorder.hclust gclus
# Correlogram in the original variable order: shaded cells below the
# diagonal, pie glyphs above it, variable names on the diagonal.
corrgram(
  dataset_only_data,
  order = FALSE,
  lower.panel = panel.shade,
  upper.panel = panel.pie,
  text.panel = panel.txt,
  main = "correlation between variables"
)
library(corrplot)
## Warning: package 'corrplot' was built under R version 3.6.3
## corrplot 0.84 loaded
# Three renderings of the same correlation matrix.
for (glyph in c("circle", "ellipse", "number")) {
  corrplot(M, method = glyph)
}

# Diverging palette used for the coloured correlation plots.
col <- colorRampPalette(
  c("#BB4444", "#EE9988", "#FFFFFF", "#AADD77", "#77AA44")
)

# Significance tests for every pair of variables.
res1 <- cor.mtest(dataset_only_data, conf.level = 0.95)

corrplot(
  M,
  method = "color",
  col = col(200),
  type = "upper",
  order = "original",
  number.cex = 0.8,
  addCoef.col = "black",  # print the coefficient inside each cell
  tl.col = "black",       # label colour
  tl.srt = 90,            # label rotation
  p.mat = res1$p,         # combine with the significance tests:
  sig.level = 0.05,       #   cells with p above this level ...
  insig = "blank",        #   ... are left blank
  diag = FALSE            # hide the principal diagonal
)
We focus on some pairs of variables for which we observe a certain correlation. First, we look at the behaviour of commits against complexity.
library(ggplot2)
library(ggpubr)
## Warning: package 'ggpubr' was built under R version 3.6.3
## Loading required package: magrittr
# Rows of the "monica" project only.
monica <- dataset1[dataset1$project == "monica", 1:12]

# Commits against complexity with a dashed linear fit; the annotation added
# in the last line shows the Pearson correlation.
sp <- ggplot(data = monica, mapping = aes(x = commits, y = complexity)) +
  geom_point(aes(colour = project), shape = 16) +
  geom_smooth(
    method = lm,
    linetype = "dashed",
    color = "darkred",
    fill = "grey"
  ) +
  coord_cartesian(xlim = c(0, 255), ylim = c(1500, 4300))
sp + stat_cor(method = "pearson", label.x = 160, label.y = 4300)
## `geom_smooth()` using formula 'y ~ x'
# Rows of the "sonar-dotnet" project only.
sonar_dotnet <- dataset1[dataset1$project == "sonar-dotnet", 1:12]

# Committers against duplicated blocks with a dashed linear fit and the
# Pearson correlation annotated on the plot.
sp <- ggplot(
  data = sonar_dotnet,
  mapping = aes(x = committers, y = duplicated_blocks)
) +
  geom_point(aes(colour = project), shape = 16) +
  geom_smooth(
    method = lm,
    linetype = "dashed",
    color = "darkred",
    fill = "grey"
  ) +
  coord_cartesian(xlim = c(0, 15), ylim = c(35, 90))
sp + stat_cor(method = "pearson", label.x = 8, label.y = 75)
## `geom_smooth()` using formula 'y ~ x'
# Rows of the "sonarqube" project only.
sonarqube <- dataset1[dataset1[["project"]] == "sonarqube", 1:12]

# Committers against bugs with a dashed linear fit and the Pearson
# correlation annotated on the plot.
sp <- ggplot(data = sonarqube, mapping = aes(x = committers, y = bugs)) +
  geom_point(aes(colour = project), shape = 16) +
  geom_smooth(
    method = lm,
    linetype = "dashed",
    color = "darkred",
    fill = "grey"
  ) +
  coord_cartesian(xlim = c(0, 31), ylim = c(48, 64))
sp + stat_cor(method = "pearson", label.x = 12, label.y = 58)
## `geom_smooth()` using formula 'y ~ x'
# Rows of the "jacoco" project only.
jacoco <- dataset1[dataset1[["project"]] == "jacoco", 1:12]

# Committers against code smells with a dashed linear fit (no zoom applied)
# and the Pearson correlation annotated on the plot.
sp <- ggplot(data = jacoco, mapping = aes(x = committers, y = code_smells)) +
  geom_point(aes(colour = project), shape = 16) +
  geom_smooth(
    method = lm,
    linetype = "dashed",
    color = "darkred",
    fill = "grey"
  )
sp + stat_cor(method = "pearson", label.x = 3, label.y = 230)
## `geom_smooth()` using formula 'y ~ x'
# All projects together: commits against complexity, coloured by project.
ggplot(data = dataset1, mapping = aes(x = commits, y = complexity)) +
  geom_point(mapping = aes(colour = project))

# Same scatter with one linear fit per project (no confidence band).
ggplot(
  data = dataset1,
  mapping = aes(x = commits, y = complexity, colour = project)
) +
  geom_point(shape = 16) +
  geom_smooth(se = FALSE, method = lm)
## `geom_smooth()` using formula 'y ~ x'
# Commits against complexity with a single linear fit over all projects.
sp <- ggplot(dataset1, aes(x = commits, y = complexity))
sp <- sp + geom_point(aes(colour = project), shape = 16)
sp <- sp + geom_smooth(
  method = lm,
  linetype = "dashed",
  color = "darkred",
  fill = "grey"
)
sp
## `geom_smooth()` using formula 'y ~ x'
# Zoom the previous plot to commits <= 500 and complexity <= 20000.
zoom_sp <- sp +
  coord_cartesian(
    xlim = c(0, 500),
    ylim = c(0, 20000)
  )
zoom_sp
## `geom_smooth()` using formula 'y ~ x'
# Base scatter of commits against complexity, coloured by project.
sp <- ggplot(data = dataset1, mapping = aes(x = commits, y = complexity)) +
  geom_point(mapping = aes(colour = project))

# Density contours drawn over the scatter.
sp + geom_density_2d()

# Filled density polygons (green = low level, red = high level).
sp +
  stat_density_2d(aes(fill = ..level..), geom = "polygon") +
  scale_fill_gradient(low = "green", high = "red")
# Density of the number of committers, one translucent curve per project.
committers_density <- ggplot(dataset1, aes(x = committers, fill = project)) +
  geom_density(
    aes(group = project, colour = project, fill = project),
    alpha = 0.1
  ) +
  theme(legend.position = "right")
committers_density

# Zoom to committers <= 10 and density <= 0.75.
zoom_sp <- committers_density +
  coord_cartesian(xlim = c(0, 10), ylim = c(0, 0.75))
zoom_sp
# Density of the number of commits, one translucent curve per project.
commits_density <- ggplot(dataset1, aes(x = commits, fill = project)) +
  geom_density(
    aes(group = project, colour = project, fill = project),
    alpha = 0.1
  ) +
  theme(legend.position = "right")
commits_density

# Zoom to commits <= 150 and density <= 0.025.
zoom_sp <- commits_density +
  coord_cartesian(xlim = c(0, 150), ylim = c(0, 0.025))
zoom_sp
We carry out a hierarchical clustering with all the variables and take 4 clusters
# Distance matrix between rows, computed on the unscaled measures.
ddata1 <- dist(x = dataset_only_data)

# Agglomerative clustering with centroid linkage.
# NOTE(review): centroid linkage is usually applied to squared Euclidean
# distances; here the plain dist() output is passed. Confirm this is intended.
gdata1 <- hclust(d = ddata1, method = "centroid")

# Dendrogram with the four-cluster cut outlined.
plot(gdata1, sub = "example", xlab = "cases", ylab = "high")
rect.hclust(
  tree = gdata1,
  k = 4,
  border = c("red", "blue", "green", "orange")
)

# Cluster membership (1..4) of every row.
clusters <- cutree(tree = gdata1, k = 4)
clusters
## [1] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [38] 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [75] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 3 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [112] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [149] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [186] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 4 4 4 4 4 4 1 4 1 1 1 1 1
## [223] 1 1
# Copy of the numeric measures with the hierarchical cluster attached as a factor.
dataset_clusters <- dataset_only_data
dataset_clusters$cluster <- factor(clusters)

# Commits against complexity coloured by hierarchical cluster, with one
# linear fit over all points.
sp <- ggplot(dataset_clusters, aes(x = commits, y = complexity))
sp <- sp + geom_point(aes(colour = cluster), shape = 16)
sp <- sp + geom_smooth(
  method = lm,
  linetype = "dashed",
  color = "darkred",
  fill = "grey"
)
sp
## `geom_smooth()` using formula 'y ~ x'
## K-means scaled values
We carry out a K-means clustering with all the variables scaled, considering 4 clusters.
library(cluster)
## Warning: package 'cluster' was built under R version 3.6.3
# Method for determining the best number of clusters in K-means: look for a
# bend ("elbow") in the within-groups sum of squares (SSE) scree plot.
#
# k-means starts from random centres, so the generator is seeded to make the
# scree plot and the cluster labels reproducible between runs (later sections
# refer to specific cluster numbers).
set.seed(1)

mydata <- dataset_only_data

# The clustering below is run on scaled variables, so the elbow curve is
# computed on the same scaled matrix; on the raw data the variables with the
# largest magnitudes would dominate the sum of squares.
zdata1 <- scale(dataset_only_data)

max_k <- 10L
wss <- numeric(max_k)
# k = 1: total sum of squares around the global centroid.
wss[1] <- (nrow(zdata1) - 1) * sum(apply(zdata1, 2, var))
for (k in 2:max_k) {
  wss[k] <- sum(kmeans(zdata1, centers = k)$withinss)
}
plot(seq_len(max_k), wss, type = "b", xlab = "Number of Clusters",
     ylab = "Within groups sum of squares")

# K-means with 4 clusters on the scaled variables.
kcdata1 <- kmeans(x = zdata1, centers = 4)
kcdata1$cluster
## [1] 4 1 4 2 1 4 4 4 4 4 4 4 2 4 4 4 4 2 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4
## [38] 4 4 1 4 1 4 4 4 4 1 4 1 4 1 1 1 1 1 1 4 4 4 4 4 2 4 4 4 4 4 4 4 4 2 4 4 4
## [75] 4 1 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 2 2 2 2 2 2 2 2 2 4 4
## [112] 2 2 4 4 4 4 4 4 4 4 4 2 4 4 4 4 4 4 2 1 4 4 4 4 4 4 4 4 4 4 2 4 4 4 4 4 4
## [149] 2 1 4 4 4 4 4 4 4 4 4 4 4 2 4 4 4 4 4 4 2 1 4 1 1 4 4 1 1 4 4 1 1 4 4 2 2
## [186] 2 4 4 4 2 2 2 4 4 4 2 2 2 4 4 4 4 4 4 4 4 4 4 4 3 3 3 3 3 3 4 3 4 4 2 4 4
## [223] 4 2
# Attach the k-means (scaled data) membership as a factor.
dataset_clusters$cluster2 <- factor(kcdata1$cluster)

# Commits against complexity by k-means cluster. The fixed shape = 16 on the
# point layer takes precedence over the shape mapping declared in ggplot().
sp <- ggplot(
  data = dataset_clusters,
  mapping = aes(x = commits, y = complexity, colour = cluster2, shape = cluster2)
) +
  geom_point(aes(colour = cluster2), shape = 16) +
  geom_smooth(
    method = lm,
    linetype = "dashed",
    color = "darkred",
    fill = "grey"
  )
sp
## `geom_smooth()` using formula 'y ~ x'
We performed the characterization of clusters for the k-means algorithm
# Reset the base-graphics layout to a single panel.
par(mfrow = c(1, 1))
# lattice supplies splom(), used below.
library(lattice)
##
## Attaching package: 'lattice'
## The following object is masked from 'package:corrgram':
##
## panel.fill
# Scatter-plot matrix of the first nine measures, points grouped by k-means cluster.
splom(
  ~ dataset_clusters[1:9],
  groups = cluster2,
  data = dataset_clusters,
  pch = 16
)
library(vioplot)
## Warning: package 'vioplot' was built under R version 3.6.3
## Loading required package: sm
## Warning: package 'sm' was built under R version 3.6.3
## Package 'sm', version 2.2-5.6: type help(sm) for summary information
## Loading required package: zoo
## Warning: package 'zoo' was built under R version 3.6.3
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
# Code smells split by k-means cluster, one violin per cluster.
x1 <- with(dataset_clusters, code_smells[cluster2 == 1])
x2 <- with(dataset_clusters, code_smells[cluster2 == 2])
x3 <- with(dataset_clusters, code_smells[cluster2 == 3])
x4 <- with(dataset_clusters, code_smells[cluster2 == 4])
vioplot(x1, x2, x3, x4, names = c("1", "2", "3", "4"), col = "grey")
title("Code smells per cluster")

# Commits split by k-means cluster, one violin per cluster.
x1 <- with(dataset_clusters, commits[cluster2 == 1])
x2 <- with(dataset_clusters, commits[cluster2 == 2])
x3 <- with(dataset_clusters, commits[cluster2 == 3])
x4 <- with(dataset_clusters, commits[cluster2 == 4])
vioplot(x1, x2, x3, x4, names = c("1", "2", "3", "4"), col = "grey")
title("Commits per cluster")
We compute correlation and scatter plots for clusters
# One table per k-means cluster, restricted to the first ten columns
# (the numeric measures).
c1 <- dataset_clusters[dataset_clusters$cluster2 == "1", 1:10]
c2 <- dataset_clusters[dataset_clusters$cluster2 == "2", 1:10]
c3 <- dataset_clusters[dataset_clusters$cluster2 == "3", 1:10]
c4 <- dataset_clusters[dataset_clusters$cluster2 == "4", 1:10]

# Numeric correlation plots for the first two clusters.
corrplot(cor(c1), method = "number")
corrplot(cor(c2), method = "number")
## Warning in cor(c2): the standard deviation is zero
# Numeric correlation plots for the remaining two clusters.
corrplot(cor(c3), method = "number")
corrplot(cor(c4), method = "number")

# Diverging palette for the coloured correlation plots.
col <- colorRampPalette(
  c("#BB4444", "#EE9988", "#FFFFFF", "#AADD77", "#77AA44")
)

# Cluster 1: coloured upper triangle; cells with p above 0.05 are left blank.
res1 <- cor.mtest(c1, conf.level = 0.95)
corrplot(
  cor(c1),
  method = "color", col = col(200),
  type = "upper", order = "original", number.cex = 0.8,
  addCoef.col = "black", tl.col = "black", tl.srt = 90,
  p.mat = res1$p, sig.level = 0.05, insig = "blank", diag = FALSE
)

# Cluster 2: significance tests (plotted in the next step).
res1 <- cor.mtest(c2, conf.level = 0.95)
## Warning in cor(x, y): the standard deviation is zero
## Warning in cor(x, y): the standard deviation is zero
## Warning in cor(x, y): the standard deviation is zero
## Warning in cor(x, y): the standard deviation is zero
## Warning in cor(x, y): the standard deviation is zero
## Warning in cor(x, y): the standard deviation is zero
## Warning in cor(x, y): the standard deviation is zero
## Warning in cor(x, y): the standard deviation is zero
## Warning in cor(x, y): the standard deviation is zero
## Warning in cor(x, y): the standard deviation is zero
## Warning in cor(x, y): the standard deviation is zero
## Warning in cor(x, y): the standard deviation is zero
## Warning in cor(x, y): the standard deviation is zero
## Warning in cor(x, y): the standard deviation is zero
## Warning in cor(x, y): the standard deviation is zero
## Warning in cor(x, y): the standard deviation is zero
## Warning in cor(x, y): the standard deviation is zero
# Cluster 2: coloured upper triangle; cells with p above 0.05 are left blank.
corrplot(
  cor(c2),
  method = "color", col = col(200),
  type = "upper", order = "original", number.cex = 0.8,
  addCoef.col = "black", tl.col = "black", tl.srt = 90,
  p.mat = res1$p, sig.level = 0.05, insig = "blank", diag = FALSE
)
## Warning in cor(c2): the standard deviation is zero
# Cluster 3: significance tests and coloured correlation plot.
res1 <- cor.mtest(c3, conf.level = 0.95)
corrplot(
  cor(c3),
  method = "color", col = col(200),
  type = "upper", order = "original", number.cex = 0.8,
  addCoef.col = "black", tl.col = "black", tl.srt = 90,
  p.mat = res1$p, sig.level = 0.05, insig = "blank", diag = FALSE
)

# Cluster 4: significance tests and coloured correlation plot.
res1 <- cor.mtest(c4, conf.level = 0.95)
corrplot(
  cor(c4),
  method = "color", col = col(200),
  type = "upper", order = "original", number.cex = 0.8,
  addCoef.col = "black", tl.col = "black", tl.srt = 90,
  p.mat = res1$p, sig.level = 0.05, insig = "blank", diag = FALSE
)

# Commits against complexity; colour and point shape both encode the cluster.
sp <- ggplot(
  data = dataset_clusters,
  mapping = aes(x = commits, y = complexity, colour = cluster2, shape = cluster2)
) +
  geom_point() +
  geom_smooth(
    method = lm,
    linetype = "dashed",
    color = "darkred",
    fill = "grey"
  )
sp
## `geom_smooth()` using formula 'y ~ x'
Some 3d plots with correlations of several measures by cluster
## Plotting for the sonarqube project: differences between clusters 2 and 3
Plotting for the sonarqube project: differences between clusters 2 and 3
library(ggplot2)
library(ggpubr)
# Make theme_minimal() the default theme for the ggplot2 plots created from here on.
theme_set(theme_minimal())
# Print the table that holds the measures together with the cluster columns.
dataset_clusters
## # A tibble: 224 x 12
## commits changes_by_comm~ committers committers_weig~ bugs code_smells
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 115 178. 8 0.0151 41 145
## 2 137 228. 17 0.0091 37 89
## 3 96 180. 13 0.0153 42 85
## 4 5 10.8 1 1 90 110
## 5 266 136. 21 0.0059 90 110
## 6 43 178. 10 0.0287 45 151
## 7 47 225. 10 0.0351 46 151
## 8 106 2097. 12 0.0132 46 151
## 9 8 93.5 3 0.221 46 127
## 10 16 1150. 3 0.147 46 127
## # ... with 214 more rows, and 6 more variables: complexity <dbl>,
## # violations <dbl>, duplicated_blocks <dbl>, open_issues <dbl>,
## # cluster <fct>, cluster2 <fct>
# Re-attach the identifier columns to the clustered measures.
# Note: this reuses the name `clusters`, replacing the cutree() result
# stored under that name earlier in the script.
clusters <- dataset_clusters
clusters$project <- dataset1$project
clusters$version <- dataset1$version

# Rows of the "sonarqube" project, all fourteen columns.
sonar <- clusters[clusters$project == "sonarqube", 1:14]

# Bugs per version: one connecting line, points marked by k-means cluster.
p_bugs <- ggplot(data = sonar, mapping = aes(x = version, y = bugs)) +
  geom_line(aes(group = 1)) +
  geom_point(
    aes(colour = cluster2, shape = cluster2, group = cluster2),
    size = 3
  )
p_bugs

# Violations per version.
p_violations <- ggplot(data = sonar, mapping = aes(x = version, y = violations)) +
  geom_line(aes(group = 1)) +
  geom_point(
    aes(colour = cluster2, shape = cluster2, group = cluster2),
    size = 3
  )
p_violations

# Commits per version.
p_commits <- ggplot(data = sonar, mapping = aes(x = version, y = commits)) +
  geom_line(aes(group = 1)) +
  geom_point(
    aes(colour = cluster2, shape = cluster2, group = cluster2),
    size = 3
  )
p_commits

# The three panels stacked in one column, labelled a-c.
figure <- ggarrange(
  p_bugs, p_violations, p_commits,
  labels = c("a", "b", "c"),
  ncol = 1,
  nrow = 3
)
figure
## K-means for normalized values
We perform the k-means algorithm with normalized values and Euclidean distance.
library(vegan)
## Warning: package 'vegan' was built under R version 3.6.3
## Loading required package: permute
## Warning: package 'permute' was built under R version 3.6.3
## Registered S3 methods overwritten by 'vegan':
## method from
## reorder.hclust seriation
## rev.hclust dendextend
## This is vegan 2.5-6
# Data normalization ("normalize" method of decostand()).
spe.norm <- decostand(dataset_only_data, method = "normalize")

# Distances between the normalized rows, then Ward clustering and dendrogram.
spe.ch <- vegdist(spe.norm, method = "euc")
spe.ch.ward <- hclust(d = spe.ch, method = "ward.D")
plot(spe.ch.ward, sub = "Ward method")

# K-means partitions for 2 to 10 groups (400 iterations), compared with the
# "ssi" criterion.
spe.KM.cascade <- cascadeKM(
  spe.norm,
  inf.gr = 2,
  sup.gr = 10,
  iter = 400,
  criterion = "ssi"
)
spe.KM.cascade$results
## 2 groups 3 groups 4 groups 5 groups 6 groups 7 groups
## SSE 10.10758900 7.32699459 5.64852057 4.07213686 3.4075889 2.82913890
## ssi 0.03329901 0.03870528 0.03111807 0.01295001 0.0143994 0.01426338
## 8 groups 9 groups 10 groups
## SSE 2.29278134 1.88894115 1.54124155
## ssi 0.01326347 0.01817455 0.02032346
# Diagram of the cascade of k-means partitions, objects sorted by group.
plot(spe.KM.cascade, sortg = TRUE)

# Compute k-means once, after seeding the generator, so that the silhouette
# plot, the contingency table and every later use of spe.kmeans describe the
# same reproducible partition. (Previously the silhouette was drawn from an
# unseeded run that was then replaced by a seeded one.)
set.seed(1)
spe.kmeans <- kmeans(spe.norm, centers = 4, nstart = 100)

# Silhouette plot of the k-means partition.
dissE <- daisy(spe.norm)
# Full element name instead of relying on partial matching of `$cl`.
sk <- silhouette(spe.kmeans$cluster, dissE)
plot(sk)

# Cross-tabulate k-means membership against the four-group Ward cut.
spebc.ward.g <- cutree(spe.ch.ward, k = 4)
table(spe.kmeans$cluster, spebc.ward.g)
## spebc.ward.g
## 1 2 3 4
## 1 0 5 37 0
## 2 91 66 0 0
## 3 0 10 0 2
## 4 0 0 0 13
# Cluster plot of the k-means partition on the normalized data.
clusplot(
  spe.norm,
  spe.kmeans$cluster,
  color = TRUE,
  shade = TRUE,
  labels = 2,
  lines = 0
)

# Attach the membership as a factor for the per-cluster analysis below.
dataset_clusters$cluster3 <- factor(spe.kmeans$cluster)
We performed the characterization of clusters for the k-means algorithm
# Reset the base-graphics layout to a single panel.
par(mfrow = c(1, 1))
library(lattice)

# Scatter-plot matrix of the first nine measures, grouped by cluster3.
splom(
  ~ dataset_clusters[1:9],
  groups = cluster3,
  data = dataset_clusters,
  pch = 16
)
library(vioplot)
# Code smells split by cluster3, one violin per cluster.
x1 <- with(dataset_clusters, code_smells[cluster3 == 1])
x2 <- with(dataset_clusters, code_smells[cluster3 == 2])
x3 <- with(dataset_clusters, code_smells[cluster3 == 3])
x4 <- with(dataset_clusters, code_smells[cluster3 == 4])
vioplot(x1, x2, x3, x4, names = c("1", "2", "3", "4"), col = "grey")
title("Code smells per cluster")

# Commits split by cluster3, one violin per cluster.
x1 <- with(dataset_clusters, commits[cluster3 == 1])
x2 <- with(dataset_clusters, commits[cluster3 == 2])
x3 <- with(dataset_clusters, commits[cluster3 == 3])
x4 <- with(dataset_clusters, commits[cluster3 == 4])
vioplot(x1, x2, x3, x4, names = c("1", "2", "3", "4"), col = "grey")
title("Commits per cluster")
We compute correlation and scatter plots for clusters
# One table per cluster3 group, restricted to the first ten columns
# (the numeric measures).
c1 <- dataset_clusters[dataset_clusters$cluster3 == "1", 1:10]
c2 <- dataset_clusters[dataset_clusters$cluster3 == "2", 1:10]
c3 <- dataset_clusters[dataset_clusters$cluster3 == "3", 1:10]
c4 <- dataset_clusters[dataset_clusters$cluster3 == "4", 1:10]

# Numeric correlation plot for each cluster.
corrplot(cor(c1), method = "number")
corrplot(cor(c2), method = "number")
corrplot(cor(c3), method = "number")
corrplot(cor(c4), method = "number")
# Diverging palette for the coloured correlation plots.
col <- colorRampPalette(
  c("#BB4444", "#EE9988", "#FFFFFF", "#AADD77", "#77AA44")
)

# For each cluster: significance tests, then the coloured upper triangle in
# which cells with p above 0.05 are left blank.

# Cluster 1
res1 <- cor.mtest(c1, conf.level = 0.95)
corrplot(
  cor(c1),
  method = "color", col = col(200),
  type = "upper", order = "original", number.cex = 0.8,
  addCoef.col = "black", tl.col = "black", tl.srt = 90,
  p.mat = res1$p, sig.level = 0.05, insig = "blank", diag = FALSE
)

# Cluster 2
res1 <- cor.mtest(c2, conf.level = 0.95)
corrplot(
  cor(c2),
  method = "color", col = col(200),
  type = "upper", order = "original", number.cex = 0.8,
  addCoef.col = "black", tl.col = "black", tl.srt = 90,
  p.mat = res1$p, sig.level = 0.05, insig = "blank", diag = FALSE
)

# Cluster 3
res1 <- cor.mtest(c3, conf.level = 0.95)
corrplot(
  cor(c3),
  method = "color", col = col(200),
  type = "upper", order = "original", number.cex = 0.8,
  addCoef.col = "black", tl.col = "black", tl.srt = 90,
  p.mat = res1$p, sig.level = 0.05, insig = "blank", diag = FALSE
)

# Cluster 4
res1 <- cor.mtest(c4, conf.level = 0.95)
corrplot(
  cor(c4),
  method = "color", col = col(200),
  type = "upper", order = "original", number.cex = 0.8,
  addCoef.col = "black", tl.col = "black", tl.srt = 90,
  p.mat = res1$p, sig.level = 0.05, insig = "blank", diag = FALSE
)
# Commits against complexity; colour and point shape both encode cluster3.
sp <- ggplot(
  data = dataset_clusters,
  mapping = aes(x = commits, y = complexity, colour = cluster3, shape = cluster3)
) +
  geom_point() +
  geom_smooth(
    method = lm,
    linetype = "dashed",
    color = "darkred",
    fill = "grey"
  )
sp
## `geom_smooth()` using formula 'y ~ x'
Some 3d plots with correlations of several measures by cluster